# 这个代码用于对已经下载好的GDELT 1.0数据库进行筛选
import os
import pandas as pd
from datetime import datetime, timedelta

# 设置开始和结束日期
date_start = '20140101'
date_end = '20240201'
key1 = 'TWN'
key2 = 'CHN'

# 初始化累积的DataFrame
cumulative_data = pd.DataFrame()

def process_data(date, cumulative_data):
    filepath = f'F:/GDELT_1.0/{date}.export.csv'

    if os.path.exists(filepath):
        df = pd.read_csv(filepath, delimiter="\t", header=None, low_memory=False)
        # 选择相关列和筛选数据
        filtered_data = df[(df.iloc[:, 7] == key1) & (df.iloc[:, 17] == key2)]
        # 合并数据
        cumulative_data = pd.concat([cumulative_data, filtered_data], ignore_index=True)
    return cumulative_data


# 遍历日期范围内的每一天，并更新累积数据
start_date = datetime.strptime(date_start, "%Y%m%d")
end_date = datetime.strptime(date_end, "%Y%m%d")
current_date = start_date

while current_date <= end_date:
    cumulative_data = process_data(current_date.strftime("%Y%m%d"), cumulative_data)
    current_date += timedelta(days=1)

# 在累积数据上进行统计和计算
if not cumulative_data.empty:
    # 统计QuadClass
    quad_class_counts = cumulative_data.iloc[:, 29].value_counts().to_dict()

    # 打印加权平均值
    weighted_goldstein = (cumulative_data.iloc[:, 30] * cumulative_data.iloc[:, 31]).sum() / cumulative_data.iloc[:,
                                                                                             31].sum()
    weighted_avg_tone = (cumulative_data.iloc[:, 34] * cumulative_data.iloc[:, 31]).sum() / cumulative_data.iloc[:,
                                                                                            31].sum()

    print(f"在 {date_start} 到 {date_end}, {key1} 对 {key2}:")
    print(f"QuadClass 统计: {quad_class_counts}")
    print(f"加权 GoldsteinScale 平均值: {weighted_goldstein}")
    print(f"加权 AvgTone 平均值: {weighted_avg_tone}")
    # 输出到Excel
    output_data = cumulative_data[[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25, 26, 27, 28, 29, 30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53, 54,55,56,57]].copy()
    output_data.columns = ['GlobalEventID','Day','MonthYear','Year','FractionDate','Actor1Code','Actor1Name','Actor1CountryCode','Actor1KnownGroupCode','Actor1EthnicCode','Actor1Religion1Code','Actor1Religion2Code','Actor1Type1Code','Actor1Type2Code','Actor1Type3Code','Actor2Code','Actor2Name','Actor2CountryCode','Actor2KnownGroupCode','Actor2EthnicCode','Actor2Religion1Code','Actor2Religion2Code','Actor2Type1Code','Actor2Type2Code','Actor2Type3Code','IsRootEvent', 'EventCode', 'EventBaseCode', 'EventRootCode', 'QuadClass', 'GoldsteinScale', 'NumMentions','NumSources','NumArticles','AvgTone','Actor1Geo_Type','Actor1Geo_Fullname','Actor1Geo_ADM1Code','Actor1Geo_ADM2Code','ActionGeo_Lat','ActionGeo_Long','Actor1Geo_FeatureID','Actor2Geo_Type','Actor2Geo_Fullname','Actor2Geo_ADM1Code','Actor2Geo_ADM2Code','Actor2Geo_Lat','Actor2Geo_Long','Actor2Geo_FeatureID','ActionGeo_Type','ActionGeo_Fullname','ActionGeo_CountryCode','ActionGeo_ADM1Code','ActionGeo_Lat','ActionGeo_Long','ActionGeo_FeatureID','DATEADDED','SourceURL']
    output_data.to_excel('GDELT_output.xlsx', index=False)

    print(f"相关数据已输出到Excel。")
else:
    print(f"在 {date_start} 到 {date_end} 没有找到关于 {key1} 对 {key2} 的数据。")



